library(tidyverse)
library(lubridate)
library(crosstable)
library(dplyr)
library(lubridate)
library(TSstudio)
data_url <- 'https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD'
nypd_shootings <- read_csv(data_url)
# Get a summary
summary(nypd_shootings)
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME BORO
## Min. : 9953245 Length:25596 Length:25596 Length:25596
## 1st Qu.: 61593633 Class :character Class1:hms Class :character
## Median : 86437258 Mode :character Class2:difftime Mode :character
## Mean :112382648 Mode :numeric
## 3rd Qu.:166660833
## Max. :238490103
##
## PRECINCT JURISDICTION_CODE LOCATION_DESC STATISTICAL_MURDER_FLAG
## Min. : 1.00 Min. :0.0000 Length:25596 Mode :logical
## 1st Qu.: 44.00 1st Qu.:0.0000 Class :character FALSE:20668
## Median : 69.00 Median :0.0000 Mode :character TRUE :4928
## Mean : 65.87 Mean :0.3316
## 3rd Qu.: 81.00 3rd Qu.:0.0000
## Max. :123.00 Max. :2.0000
## NA's :2
## PERP_AGE_GROUP PERP_SEX PERP_RACE VIC_AGE_GROUP
## Length:25596 Length:25596 Length:25596 Length:25596
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## VIC_SEX VIC_RACE X_COORD_CD Y_COORD_CD
## Length:25596 Length:25596 Min. : 914928 Min. :125757
## Class :character Class :character 1st Qu.:1000011 1st Qu.:182782
## Mode :character Mode :character Median :1007715 Median :194038
## Mean :1009455 Mean :207894
## 3rd Qu.:1016838 3rd Qu.:239429
## Max. :1066815 Max. :271128
##
## Latitude Longitude Lon_Lat
## Min. :40.51 Min. :-74.25 Length:25596
## 1st Qu.:40.67 1st Qu.:-73.94 Class :character
## Median :40.70 Median :-73.92 Mode :character
## Mean :40.74 Mean :-73.91
## 3rd Qu.:40.82 3rd Qu.:-73.88
## Max. :40.91 Max. :-73.70
##
str(nypd_shootings, give.attr = FALSE)
## spc_tbl_ [25,596 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ INCIDENT_KEY : num [1:25596] 2.36e+08 2.31e+08 2.31e+08 2.38e+08 2.24e+08 ...
## $ OCCUR_DATE : chr [1:25596] "11/11/2021" "07/16/2021" "07/11/2021" "12/11/2021" ...
## $ OCCUR_TIME : 'hms' num [1:25596] 15:04:00 22:05:00 01:09:00 13:42:00 ...
## $ BORO : chr [1:25596] "BROOKLYN" "BROOKLYN" "BROOKLYN" "BROOKLYN" ...
## $ PRECINCT : num [1:25596] 79 72 79 81 113 113 42 52 34 75 ...
## $ JURISDICTION_CODE : num [1:25596] 0 0 0 0 0 0 0 0 0 0 ...
## $ LOCATION_DESC : chr [1:25596] NA NA NA NA ...
## $ STATISTICAL_MURDER_FLAG: logi [1:25596] FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ PERP_AGE_GROUP : chr [1:25596] NA "45-64" "<18" NA ...
## $ PERP_SEX : chr [1:25596] NA "M" "M" NA ...
## $ PERP_RACE : chr [1:25596] NA "ASIAN / PACIFIC ISLANDER" "BLACK" NA ...
## $ VIC_AGE_GROUP : chr [1:25596] "18-24" "25-44" "25-44" "25-44" ...
## $ VIC_SEX : chr [1:25596] "M" "M" "M" "M" ...
## $ VIC_RACE : chr [1:25596] "BLACK" "ASIAN / PACIFIC ISLANDER" "BLACK" "BLACK" ...
## $ X_COORD_CD : num [1:25596] 996313 981845 996546 1001139 1050710 ...
## $ Y_COORD_CD : num [1:25596] 187499 171118 187436 192775 184826 ...
## $ Latitude : num [1:25596] 40.7 40.6 40.7 40.7 40.7 ...
## $ Longitude : num [1:25596] -74 -74 -74 -73.9 -73.8 ...
## $ Lon_Lat : chr [1:25596] "POINT (-73.95650899099996 40.68131820000008)" "POINT (-74.00866668999998 40.63636384100005)" "POINT (-73.95566903799994 40.68114495900005)" "POINT (-73.939095905 40.69579171600003)" ...
# Visualize the distribution of perp age
plot(as.factor(nypd_shootings$PERP_AGE_GROUP))
# Remove invalid perp age values
nypd_shootings$PERP_AGE_GROUP[nypd_shootings$PERP_AGE_GROUP %in% c(1020, 224, 940)] <- NA
# Define the list of variables that should be factors
factor_vars <- c('BORO'
,'PRECINCT'
,'JURISDICTION_CODE'
,'LOCATION_DESC'
,'PERP_AGE_GROUP'
,'PERP_RACE'
,'PERP_SEX'
,'VIC_AGE_GROUP'
,'VIC_RACE'
,'VIC_SEX')
# Convert variables to factors
nypd_shootings[factor_vars] <- lapply(nypd_shootings[factor_vars], as.factor)
# Convert OCCUR_DATE to date
nypd_shootings$OCCUR_DATE <- as.Date(nypd_shootings$OCCUR_DATE, '%m/%d/%Y')
# Create a datetime variable
nypd_shootings$OCCUR_DT <- nypd_shootings %>%
select(OCCUR_DATE, OCCUR_TIME) %>%
mutate(OCCUR_DT = as.POSIXct(paste(OCCUR_DATE, OCCUR_TIME), format = '%Y-%m-%d %H:%M:%S')) %>%
select(OCCUR_DT)
# Get new summaries
summary(nypd_shootings)
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME
## Min. : 9953245 Min. :2006-01-01 Length:25596
## 1st Qu.: 61593633 1st Qu.:2009-05-10 Class1:hms
## Median : 86437258 Median :2012-08-26 Class2:difftime
## Mean :112382648 Mean :2013-06-13 Mode :numeric
## 3rd Qu.:166660833 3rd Qu.:2017-07-01
## Max. :238490103 Max. :2021-12-31
##
## BORO PRECINCT JURISDICTION_CODE
## BRONX : 7402 75 : 1470 0 :21321
## BROOKLYN :10365 73 : 1372 1 : 59
## MANHATTAN : 3265 67 : 1160 2 : 4214
## QUEENS : 3828 79 : 982 NA's: 2
## STATEN ISLAND: 736 44 : 949
## 47 : 903
## (Other):18760
## LOCATION_DESC STATISTICAL_MURDER_FLAG PERP_AGE_GROUP
## MULTI DWELL - PUBLIC HOUS: 4559 Mode :logical <18 :1463
## MULTI DWELL - APT BUILD : 2664 FALSE:20668 18-24 :5844
## PVT HOUSE : 893 TRUE :4928 25-44 :5202
## GROCERY/BODEGA : 622 45-64 : 535
## BAR/NIGHT CLUB : 588 65+ : 57
## (Other) : 1293 UNKNOWN:3148
## NA's :14977 NA's :9347
## PERP_SEX PERP_RACE VIC_AGE_GROUP VIC_SEX
## F : 371 BLACK :10668 <18 : 2681 F: 2403
## M :14416 WHITE HISPANIC: 2164 18-24 : 9604 M:23182
## U : 1499 UNKNOWN : 1836 25-44 :11386 U: 11
## NA's: 9310 BLACK HISPANIC: 1203 45-64 : 1698
## WHITE : 272 65+ : 167
## (Other) : 143 UNKNOWN: 60
## NA's : 9310
## VIC_RACE X_COORD_CD Y_COORD_CD
## AMERICAN INDIAN/ALASKAN NATIVE: 9 Min. : 914928 Min. :125757
## ASIAN / PACIFIC ISLANDER : 354 1st Qu.:1000011 1st Qu.:182782
## BLACK :18281 Median :1007715 Median :194038
## BLACK HISPANIC : 2485 Mean :1009455 Mean :207894
## UNKNOWN : 65 3rd Qu.:1016838 3rd Qu.:239429
## WHITE : 660 Max. :1066815 Max. :271128
## WHITE HISPANIC : 3742
## Latitude Longitude Lon_Lat
## Min. :40.51 Min. :-74.25 Length:25596
## 1st Qu.:40.67 1st Qu.:-73.94 Class :character
## Median :40.70 Median :-73.92 Mode :character
## Mean :40.74 Mean :-73.91
## 3rd Qu.:40.82 3rd Qu.:-73.88
## Max. :40.91 Max. :-73.70
##
## OCCUR_DT.OCCUR_DT
## Min. :2006-01-01 02:00:00.00
## 1st Qu.:2009-05-10 04:05:00.00
## Median :2012-08-26 01:05:00.00
## Mean :2013-06-14 04:24:56.25
## 3rd Qu.:2017-07-01 00:20:15.00
## Max. :2021-12-31 19:23:00.00
##
str(nypd_shootings, give.attr = FALSE)
## spc_tbl_ [25,596 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ INCIDENT_KEY : num [1:25596] 2.36e+08 2.31e+08 2.31e+08 2.38e+08 2.24e+08 ...
## $ OCCUR_DATE : Date[1:25596], format: "2021-11-11" "2021-07-16" ...
## $ OCCUR_TIME : 'hms' num [1:25596] 15:04:00 22:05:00 01:09:00 13:42:00 ...
## $ BORO : Factor w/ 5 levels "BRONX","BROOKLYN",..: 2 2 2 2 4 4 1 1 3 2 ...
## $ PRECINCT : Factor w/ 77 levels "1","5","6","7",..: 51 45 51 52 71 71 25 34 22 47 ...
## $ JURISDICTION_CODE : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ LOCATION_DESC : Factor w/ 39 levels "ATM","BANK","BAR/NIGHT CLUB",..: NA NA NA NA NA NA 9 NA NA NA ...
## $ STATISTICAL_MURDER_FLAG: logi [1:25596] FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ PERP_AGE_GROUP : Factor w/ 6 levels "<18","18-24",..: NA 4 1 NA NA NA NA NA NA 3 ...
## $ PERP_SEX : Factor w/ 3 levels "F","M","U": NA 2 2 NA NA NA NA NA NA 2 ...
## $ PERP_RACE : Factor w/ 7 levels "AMERICAN INDIAN/ALASKAN NATIVE",..: NA 2 3 NA NA NA NA NA NA 4 ...
## $ VIC_AGE_GROUP : Factor w/ 6 levels "<18","18-24",..: 2 3 3 3 3 3 2 3 3 3 ...
## $ VIC_SEX : Factor w/ 3 levels "F","M","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ VIC_RACE : Factor w/ 7 levels "AMERICAN INDIAN/ALASKAN NATIVE",..: 3 2 3 3 3 3 3 3 4 7 ...
## $ X_COORD_CD : num [1:25596] 996313 981845 996546 1001139 1050710 ...
## $ Y_COORD_CD : num [1:25596] 187499 171118 187436 192775 184826 ...
## $ Latitude : num [1:25596] 40.7 40.6 40.7 40.7 40.7 ...
## $ Longitude : num [1:25596] -74 -74 -74 -73.9 -73.8 ...
## $ Lon_Lat : chr [1:25596] "POINT (-73.95650899099996 40.68131820000008)" "POINT (-74.00866668999998 40.63636384100005)" "POINT (-73.95566903799994 40.68114495900005)" "POINT (-73.939095905 40.69579171600003)" ...
## $ OCCUR_DT : tibble [25,596 × 1] (S3: tbl_df/tbl/data.frame)
## ..$ OCCUR_DT: POSIXct[1:25596], format: "2021-11-11 15:04:00" "2021-07-16 22:05:00" ...
# Get a count of NAs per each variable
sapply(nypd_shootings, function(x) sum(is.na(x)))
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME
## 0 0 0
## BORO PRECINCT JURISDICTION_CODE
## 0 0 2
## LOCATION_DESC STATISTICAL_MURDER_FLAG PERP_AGE_GROUP
## 14977 0 9347
## PERP_SEX PERP_RACE VIC_AGE_GROUP
## 9310 9310 0
## VIC_SEX VIC_RACE X_COORD_CD
## 0 0 0
## Y_COORD_CD Latitude Longitude
## 0 0 0
## Lon_Lat OCCUR_DT
## 0 0
# How many incidents lack demographic data on perp
sum(is.na(nypd_shootings$PERP_AGE_GROUP) |
is.na(nypd_shootings$PERP_SEX) |
is.na(nypd_shootings$PERP_RACE))
## [1] 9347
Approximately 60% of incidents lack a value for LOCATION_DESC, which will hurt its utility. Likely it will not be worth ignoring these incidents and so some form of imputation would be necessary to work with this variable. Additionally, over 1/3 of incidents lack demographic data (age, sex, race) on the perpetrator. I do not plan to impute these details.
# Get the distribution of victims by age and sex; most victims are male
nypd_shootings %>%
select(VIC_AGE_GROUP, VIC_SEX) %>%
crosstable(., VIC_SEX, by = VIC_AGE_GROUP, total = 'both', #showNA = 'always',
percent_digits = 1, percent_pattern = '{n} ({p_col}/{p_row})') %>%
as_flextable(keep_id = TRUE)
.id | label | variable | VIC_AGE_GROUP | Total | |||||
|---|---|---|---|---|---|---|---|---|---|
<18 | 18-24 | 25-44 | 45-64 | 65+ | UNKNOWN | ||||
VIC_SEX | VIC_SEX | F | 376 (14.0%/15.6%) | 732 (7.6%/30.5%) | 914 (8.0%/38.0%) | 322 (19.0%/13.4%) | 54 (32.3%/2.2%) | 5 (8.3%/0.2%) | 2403 (9.4%) |
M | 2305 (86.0%/9.9%) | 8868 (92.3%/38.3%) | 10470 (92.0%/45.2%) | 1376 (81.0%/5.9%) | 113 (67.7%/0.5%) | 50 (83.3%/0.2%) | 23182 (90.6%) | ||
U | 0 (0%/0%) | 4 (0.04%/36.4%) | 2 (0.02%/18.2%) | 0 (0%/0%) | 0 (0%/0%) | 5 (8.3%/45.5%) | 11 (0.04%) | ||
Total | 2681 (10.5%) | 9604 (37.5%) | 11386 (44.5%) | 1698 (6.6%) | 167 (0.7%) | 60 (0.2%) | 25596 (100.0%) | ||
# Create a very simple linear model for victim sex based on victim age group
lm_age <- nypd_shootings %>%
mutate(male_vic = if_else(VIC_SEX == 'M', 1, 0)) %>%
filter(OCCUR_DATE >= as.Date('2017-01-01')) %>%
lm(formula = male_vic ~ VIC_AGE_GROUP)
summary(lm_age)
##
## Call:
## lm(formula = male_vic ~ VIC_AGE_GROUP, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91433 0.08567 0.08567 0.09361 0.33333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.906746 0.013279 68.282 < 2e-16 ***
## VIC_AGE_GROUP18-24 -0.000361 0.014799 -0.024 0.9805
## VIC_AGE_GROUP25-44 0.007579 0.014171 0.535 0.5928
## VIC_AGE_GROUP45-64 -0.105717 0.018133 -5.830 5.79e-09 ***
## VIC_AGE_GROUP65+ -0.240079 0.046383 -5.176 2.33e-07 ***
## VIC_AGE_GROUPUNKNOWN -0.240079 0.100258 -2.395 0.0167 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2981 on 6848 degrees of freedom
## Multiple R-squared: 0.01537, Adjusted R-squared: 0.01465
## F-statistic: 21.38 on 5 and 6848 DF, p-value: < 2.2e-16
# Show the time series of shooting incidents by month, from Jan'06 to Dec'21
nypd_shootings %>%
mutate(occur_month = ceiling_date(OCCUR_DATE, unit = 'month') - 1) %>%
group_by(occur_month) %>%
arrange(occur_month) %>%
summarise(incidents = n()) %>%
select(incidents) %>%
ts(., start = c(2006, 1), end = c(2021, 12), frequency = 12) %>%
ts_plot(title = 'NYC Shooting Incidents by Month',
Xtitle = '# Shooting Incidents',
Ytitle = 'Incident Month',
line.mode = 'lines+markers',
Xgrid = TRUE, Ygrid = TRUE)
# Zoom in on the latest 6 years of data and visualize YoY behavior
nypd_shootings %>%
filter(OCCUR_DATE >= as.Date('2016-01-01')) %>%
mutate(occur_month = ceiling_date(OCCUR_DATE, unit = 'month') - 1) %>%
group_by(occur_month) %>%
arrange(occur_month) %>%
summarise(incidents = n()) %>%
mutate(
occur_year = year(occur_month),
occur_month = month(occur_month, label = TRUE, abbr = TRUE)
) %>%
ggplot(., aes(occur_month, incidents, group = factor(occur_year))) +
geom_point(aes(shape = factor(occur_year))) +
geom_line()
While no victim cross-section of sex and age make up a majority of
these incidents, the largest group, males age 25-44, make up ~41% of all
victims. The second most common group are males ages 18-24, making up
~35% of all victims. That is, just over 75% of all victims are males
ages 18-44. Based on this, it is not surprising to see that the simple
linear model used to predict the sex of the victim (1 male, 0 otherwise)
has a very high intercept.
The time series data show clearly that in summer 2020, NYC saw a large
increase in shooting incidents that reversed the previous ~3 years of
low rates (low rates 2017 - 2019). This increase was sustained through
the end of 2021. Looking more closely at these data, we see that 2020
breaks with the previous years beginning in May. This increased
frequency continues through the end of the reporting period (though it
is interesting to note that Jan’21 appears to have been a typical
month).
In reviewing these data on shooting incidents in New York City, we
have noted three high-level details.
1. Data on the perpetrator (race, age, etc.) are often missing
2. Most shooting victims are male, between ages 18 - 44
3. Shooting incident frequency peaked in the summer of 2020, and
remained elevated through Dec’21, following three years of below typical
rates
Many sources of potential bias exist with these data and any associated analysis. In the data we must consider issues such as social and political biases that impact the collection of data, such as which areas/neighborhoods are policed and biases in reporting. For example, are all victims equally likely to report the incident and/or work with the police? When details are provided, such as perpetrator race, age, etc., how reliable are these details? It is well known that our recall of stressful events is often poor. My own personal bias is to disregard demographic data on the perpetrator(s) as I am suspicious of the data and because I am hesitant to produce a conclusion that might align with preconceived notions of the type of person who commits violent crime. Additionally, I believe it would be difficult to avoid introducing personal bias in developing a conclusion around the spike in crime beginning in summer 2020. To mitigate this I would look to leverage additional data to isolate what made 2020 - 2021 different from previous years, avoiding armchair hypotheses and easy explanations that come from having lived through the period (“it’s COVID madness”).